{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/pershinmr/.virtualenvs/jafar/local/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import xgboost as xgb\n",
    "import tqdm\n",
    "import hyperopt\n",
    "import sys\n",
    "import scipy\n",
    "\n",
    "import lightgbm\n",
    "\n",
    "from catboost import CatBoost, CatBoostClassifier\n",
    "\n",
    "from datetime import datetime\n",
    "from sklearn.cross_validation import cross_val_score, KFold\n",
    "from sklearn.metrics import log_loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Both competition files are ';'-separated.\n",
    "train_df = pd.read_csv('train.csv', sep=';')\n",
    "# Only the test file is read with the literal string 'None' treated as a missing value (NaN).\n",
    "test_df = pd.read_csv('test.csv', sep=';', na_values='None')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Data exploration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(70000, 13) (30000, 12)\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: (rows, columns) of the train and test frames.\n",
    "print('%s %s' % (train_df.shape, test_df.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Stack train and test rows so the cleaning steps below are applied to both at once.\n",
    "# The original index labels are kept, so labels are not unique in full_df.\n",
    "full_df = pd.concat([train_df, test_df])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#####  height fix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7f519c655750>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZkAAAEACAYAAABhzAtFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGOJJREFUeJzt3XuQpXV95/H3B0bwmsnoLnRlUAcQEd01hChQmqyNhptW\nQFOFhVmLi7irUaO7qSQCbjljuZsVK6zCWobsShRcES9EGTesM3GhY6wVCCIB5TaW4TIQOq4yuJiU\ngnz3j/PrmTNN98yZnn76nD79flWdmuf5Prffeer0fM/v9pxUFZIkdWGfYRdAkjS+TDKSpM6YZCRJ\nnTHJSJI6Y5KRJHXGJCNJ6kynSSbJpUmmk9w6x7bfT/JEkmf3xS5OsiXJLUmO7IufmeTuJHclOaMv\nflSSW9u2j3b5XiRJe67rmswngRNnB5McBPwGcG9f7GTg0Ko6DHgbcEmLrwHeD7wcOAZYn2R1O+xP\ngLdW1QuBFyZ50rUkScPTaZKpqm8AD8+x6SPAH8yKnQpc3o67AVid5EB6SWpzVT1SVduAzcBJSSaA\nZ1XVje34y4HXd/A2JEkLtOR9Mkl+E7i/qm6btWktcH/f+tYWmx1/oC++dY79JUkjYtVSXizJ04D3\nAcfPtXmO9Zojzm7ikqQRsaRJBjgUWAf8bZIABwE3JzmaXk3kuX37HgQ82OKTs+LX7WL/OSUxAUnS\nAlTVXF/qB7IUzWVpL6rqO1U1UVWHVNXB9BLFr1TVPwAbgTMAkhwLbKuqaWATcHyS1W0QwPHApqp6\nCPhxkqNbwjoDuHpXBakqX4vwWr9+/dDLME4v76f3c5Rfe6vrIcxXAP+H3siv+5KcPWuX7c1eVXUN\n8HdJvgf8KfCOFn8Y+CBwE3AD8IHqDQCg7XMpcDewpaq+2uX7kSTtmU6by6rqt3ez/ZBZ6++aZ79P\nAZ+aI/4t4F8uvISSpC454197bHJycthFGCvez8Xl/RwtWYw2t+UgSa2U9ypJiyUJNeId/5KkFcok\nI0nqjElGktQZk4wkqTMmGUlSZ0wykqTOmGSkMTcxsY4kJGFiYt2wi6MVxnky0pjrPdpv5rOfRXke\nlVYO58lIkkaWSUaS1BmTjCSpMyYZSVJnTDKSpM6YZCRJnTHJSJI6Y5KRJHXGJCNJ6oxJRpLUGZOM\nJKkzJhlJUmdMMpKkznSaZJJcmmQ6ya19sQ8nuSPJLUmuSvILfdvOS7KlbT+hL35SkjuT3J3kvX3x\ndUmuT3JXks8mWdXl+5Ek7ZmuazKfBE6cFdsMvKSqjgS2AOcBJHkx8EbgCOBk4OPp2Qf4WDvPS4A3\nJXlRO9cFwIVVdTiwDTin4/cjSdoDnSaZqvoG8PCs2Neq6om2ej1wUFs+Bbiyqh6vqnvoJaCj22tL\nVd1bVY8BVwKntmNeDVzVli8D3tDVe5Ek7blh98m8BbimLa8F7u/b9kCLzY5vBdYmeQ7wcF/C2gr8\nUrfFlSTtiaH1YSR5H/BYVX12JjTHbsXcibDa/rOP2eVP/m3YsGH78uTkJJOTkwOWVpJWhqmpKaam\nphbtfJ3//HKS5wNfqaqX9sXOBP4t8Oqq+mmLnQtUVV3Q1r8KrKeXSDZU1Umz90vyA+DAqnoiybHA\n+qo6eZ5y+PPLWpH8+WXtjeXw88s71TiSnAT8IXDKTIJpNgKnJ9kvycHAC4Abgb8BXpDk+Un2A04H\nrm7HXAuc1pbP7ItLkkZApzWZJFcAk8BzgGl6NZPzgf2AH7bdrq+qd7T9z6M3Quwx4D1VtbnFTwIu\nopcUL62qD7X4wfQGAqwBvg28uQ0OmKss1mS0IlmT0d7Y25pM581lo8Iko5XKJKO9sRyayyRJK5RJ\nRpLUGZOMJKkzJhlJUmdMMpKkzphkJEmdMclIkjpjkpEkdcYkI0nqjElGktQZk4wkqTMmGUlSZ0wy\nkqTOmGQkSZ0xyUiSOmOSkSR1xiQjSe
qMSUaS1BmTjCSpMyYZSVJnTDKSpM6YZCRJnTHJSJI602mS\nSXJpkukkt/bF1iTZnOSuJJuSrO7bdnGSLUluSXJkX/zMJHe3Y87oix+V5Na27aNdvhdp3ExMrCPJ\n9tfExLphF0ljqOuazCeBE2fFzgW+VlWHA9cC5wEkORk4tKoOA94GXNLia4D3Ay8HjgHW9yWmPwHe\nWlUvBF6YZPa1JM1jevpeoLa/euvS4uo0yVTVN4CHZ4VPBS5ry5e19Zn45e24G4DVSQ6kl6Q2V9Uj\nVbUN2AyclGQCeFZV3diOvxx4fWdvRpK0x4bRJ3NAVU0DVNVDwAEtvha4v2+/rS02O/5AX3zrHPtL\nkkbEqmEXoE/mWK854uwmPq8NGzZsX56cnGRycnKPCihJ425qaoqpqalFO1+qdvn/8t5fIHk+8JWq\nemlbvwOYrKrp1uR1XVUdkeSStvy5tt+dwKuA49r+b2/xS4DrgL+aObbFTwdeVVW/M085quv3Ko2i\nZOb7GkCY+TvYOb7zNmlGEqpqri/1A1mK5rKwc61jI3BWWz4LuLovfgZAkmOBba1ZbRNwfJLVbRDA\n8cCm1tT24yRHp/fXckbfuSRJI6DT5rIkVwCTwHOS3AesBz4EfCHJW4D7gNMAquqaJK9N8j3gJ8DZ\nLf5wkg8CN9H72vWBNgAA4B3Ap4CnAtdU1Ve7fD+SpD3TeXPZqLC5TCuVzWXaG8uhuUyStEKZZCRJ\nnTHJSGOg/xExPh5Go8Q+GWkMzNfvsqtt9sloEPbJSJJGlklGktQZk4wkqTMmGUlSZ0wykqTOmGQk\nSZ0xyUiSOmOSkSR1xiQjSeqMSUaS1BmTjCSpMyYZSVJnTDKSpM6YZCRJnTHJSGr29zdptOgGSjJJ\n/kXXBZE0bD+l9/syxfT0vcMujMbEoDWZS5LcmOQdSX6x0xJJksbGQEmmqn4N+NfAc4GbklyR5PhO\nSyZJWvb26OeXk+wLvB64GPgxEOD8qvrzboq3ePz5ZY2zxfr55fnOoZVrSX5+OclLk3wEuAN4NfCb\nVXVEW/7IQi6c5N8n+U6SW5N8Jsl+SdYluT7JXUk+m2RV23e/JFcm2ZLkm0me13ee81r8jiQnLKQs\nkqRuDNon8zHgZuCXq+qdVXUzQFU9CPyHPb1okl8Cfhc4qqpeCqwC3gRcAFxYVYcD24Bz2iHnAD+q\nqsOAjwIfbud5MfBG4AjgZODj6X09k1awHaPE/HPQsA2aZF4LXFFV/wSQZJ8kTweoqk8v8Nr7As9o\ntZWnAQ8CxwFXte2X0WuaAzi1rQN8kV4NCuAU4Mqqeryq7gG2AEcvsDzSmNgxSmzn5jBp6Q2aZL5G\nLxHMeHqLLUirAV0I3Ac8ADxCr6a0raqeaLttBda25bXA/e3YnwOPJHl2f7x5oO8YSdKQrRpwv6dW\n1aMzK1X16ExNZiHaMOhTgefTSzBfoNfcNduOXsi5t80Xn9OGDRu2L09OTjI5OTlQeSVppZiammJq\namrRzjdokvlJkqNm+mKS/CrwT3tx3d8Avl9VP2rn+xLwCuAXk+zTajMH0WtCg16t5rnAg22E2+qq\nejjJTHxG/zFP0p9kJElPNvsL+Ac+8IG9Ot+gzWX/DvhCkr9O8tfA54B37cV17wOOTfLU1lH/GuC7\nwHXAaW2fM4Gr2/LGtk7bfm1f/PQ2+uxg4AXAjXtRLknSIhp4nkySpwCH02uiurOqHturCyfrgdOB\nx4BvA2+lVxO5EljTYm+uqseS7A98GvgV4IfA6a2jnyTn0Rt99hjwnqraPM/1nCejsTV7Lswg81+c\nJ6NB7O08mT1JMq8A1tHXxFZVly/0wkvNJKNxZpJRV/Y2yQzUJ5Pk08ChwC3Az1u4gGWTZCRJS2/Q\njv+XAS+2KiCNhomJdT4pWcvCoB3/3wEmuiyIpMH1EowTLjX6Bq3J/DPg9iQ30ptODEBVndJJqSRJ\nY2
HQJLOhy0JIksbTnowuez5wWFV9rc3237eq/l+npVtEji7TONndyDBHl2mxLNWj/v8NvQdT/mkL\nrQW+vNCLSpJWhkE7/t8JvJLeD5VRVVuAA7oqlCRpPAyaZH5aVT+bWWmP57cuLUnapUGTzF8lOR94\nWpLj6T01+SvdFUuSNA4G6vhPsg+954OdQK93cBPwieXUk27Hv8aJHf9aKkv27LLlziSjcWKS0VJZ\nqmeX/R1z9MFU1SELvbAkafztybPLZjyV3m+6PHvxiyNJGicLbi5L8q2q+tVFLk9nbC7TOLG5TEtl\nqZrLjupb3YdezWbQWpAkaYUaNFFc2Lf8OHAP8MZFL40kaaw4ukxahmwu01JZquay39vV9qr6Lwst\ngCRpfA064/9lwO/QezDmWuDtwFHAs9pLUscmJtaRpNVAFmr/RTiHNLhBZ/x/HXjdzKP9kzwL+Iuq\n+lcdl2/R2Fym5W7n5q2FN5ftabOaVrYledQ/cCDws771n7WYJEnzGnR02eXAjUm+RO+rzhuAyzor\nlSRpLAxUk6mq/wScDTwMbAPOrqo/2psLJ1md5AtJ7kjy3STHJFmTZHOSu5JsSrK6b/+Lk2xJckuS\nI/viZya5ux1zxt6USZK0uAZtLgN4OvDjqroI2Jrk4L289kXANVV1BPDLwJ3AucDXqupw4FrgPIAk\nJwOHVtVhwNuAS1p8DfB+4OXAMcD6/sQkSRquQX9+eT3wXtp/+sBTgP+x0Iu2gQO/XlWfBKiqx6vq\nEeBUdjTDXdbWaf9e3va9AVid5EDgRGBzVT1SVduAzcBJCy2XJGlxDVqTeQNwCvATgKp6kL0bunwI\n8H+TfDLJzUn+W5KnAwdW1XS7xkPs+InntcD9fcdvZcdw6v74Ay0mSRoBg3b8/6yqKkkBJHnGIlz3\nKOCdVXVTko/Qayqbb8zk7OFzM2Mt5xpWN++4yw0bNmxfnpycZHJycvASS9IKMDU1xdTU1KKdb9B5\nMr8PHAYcD/xn4C3AFVX1Xxd00V5T1zdnfo8mya/RSzKHApNVNZ1kAriuqo5Icklb/lzb/07gVcBx\nbf+3t/hO+826pvNktKw5T0bDsCTzZKrqj4EvAlcBhwPvX2iCaeebBu5P8sIWeg3wXWAjcFaLnQVc\n3ZY3AmcAJDkW2NbOsQk4vo1UW0MvCW5aaLkkSYtrt81lSfalN+LrOOAvF/Ha7wY+k+QpwPfpDZHe\nF/h8krcA99H7cTSq6pokr03yPXr9Qme3+MNJPgjcRO8r2AfaAABJ0ggYtLnsfwO/1UaALUs2l2m5\ns7lMw7AkT2EGHgVuS/KXtBFmAFX17oVeWJI0/gZNMn/eXpIkDWyXzWVJnldV9y1heTpjc5mWO5vL\nNAxdjy77ct+FrlroRSRJK9Pukkx/9jqky4JIksbP7pJMzbMsSdJu7a5P5uf0RpMFeBrwjzObgKqq\nX+i8hIvEPhktd/bJaBg6HcJcVfsu9MSSJO3J78lIkrRHTDKSpM6YZCRJnTHJSJI6Y5KRJHXGJCNJ\n6oxJRtIuTUysI8n218TEumEXScvIQL8nMw6cjKnlbliTMXe+7s7bNP6W5OeXJUlaCJOMNML6m6qk\n5cjmMmmEzd9EZnOZlobNZZKkkTXozy9LWlH2t4lOi8KajKQ5/JReE5nNYto7JhlJUmeGmmSS7JPk\n5iQb2/q6JNcnuSvJZ5OsavH9klyZZEuSbyZ5Xt85zmvxO5KcMKz3Ikl6smHXZN4D3N63fgFwYVUd\nDmwDzmnxc4AfVdVhwEeBDwMkeTHwRuAI4GTg47EhWZJGxtCSTJKDgNcCn+gLvxq4qi1fBry+LZ/a\n1gG+2PYDOAW4sqoer6p7gC3A0R0WW5K0B4ZZk/kI8Ae0nsUkzwEerqon2vatwNq2vBa4H6Cqfg48\nkuTZ/fHmgb5jJElDNpQhzEleB0xX1S1JJmfC7dWvf9bYbLWL+Jw2
bNiwfXlycpLJycn5dpWkFWlq\naoqpqalFO99QZvwn+SPgzcDjwNOAZwFfBk4AJqrqiSTHAuur6uQkX23LNyTZF/j7qjogyblAVdUF\n7bzb95vjms7417IzzBn/u9rPv6WVY1nO+K+q86vqeVV1CHA6cG1VvRm4Djit7XYmcHVb3tjWaduv\n7Yuf3kafHQy8ALhxKd6DJGn3Rm3G/7nAlUk+CHwbuLTFLwU+nWQL8EN6iYmquj3J5+mNUHsMeIfV\nFUkaHT4gUxphNpdp2JZlc5kkaWUwyUiSOmOSkSR1xiQjSeqMSUaS1BmTjCSpMyYZSVJnTDKSpM6Y\nZKQRMjGxjiTbX9Jy54x/aYTsPMMf9n62/mKcwxn/K5kz/iVJI8skI0nqjElGktQZk4wkqTMmGUlS\nZ0wykqTOmGQkSZ0xyUiSOmOSkSR1xiQjSeqMSUaS1BmTjDRk/Q/FXG76yz4xsW7YxdEI8gGZ0pDt\n/FDM5fWAzNll929s/CzLB2QmOSjJtUluT3Jbkne3+Jokm5PclWRTktV9x1ycZEuSW5Ic2Rc/M8nd\n7ZgzhvF+pJVl/2Vb89LSG0pNJskEMFFVtyR5JvAt4FTgbOCHVfXhJO8F1lTVuUlOBt5VVa9Lcgxw\nUVUdm2QNcBNwFL2vW98CjqqqR+a4pjUZjaTlWJOZbz//xsbPsqzJVNVDVXVLW34UuAM4iF6iuazt\ndllbp/17edv/BmB1kgOBE4HNVfVIVW0DNgMnLdkbkSTt0tA7/pOsA44ErgcOrKpp6CUi4IC221rg\n/r7DtrbY7PgDLSZJGgGrhnnx1lT2ReA9VfVokvnq2rOrajN19LmqcPPW1zds2LB9eXJyksnJyT0p\nriSNvampKaamphbtfEMbXZZkFfA/gf9VVRe12B3AZFVNt36b66rqiCSXtOXPtf3uBF4FHNf2f3uL\n77TfrOvZJ6ORZJ+MRtmy7JNp/gy4fSbBNBuBs9ryWcDVffEzAJIcC2xrzWqbgOOTrG6DAI5vMUnS\nCBjW6LJXAl8HbqP3NaiA84Ebgc8DzwXuA05rHfok+Ri9Tv2fAGdX1c0tfhbwvnaO/1hVl89zTWsy\nGknWZDTK9rYm42RMachMMhply7m5TJI05kwykqTOmGQkSZ0xyUhaJDueaeZTmTXDjn9pyMap43++\npzVr+bLjX5I0skwykqTOmGQkSZ0xyUiSOmOSkSR1xiQjSeqMSUaS1BmTjDQEExPrtk9alMaZkzGl\nIZh/AqaTMTVanIwpSRpZJhlJUmdMMpI6sr8PyxSrhl0ASePqp8z00UxPO8BhpbImIy2B/tFkjijT\nSmKSkZbA9PS99L7Vz7xWGpvOViqbyyQtAZvOViprMpKkzoxFkklyUpI7k9yd5L3DLo8EzuqXYAyS\nTJJ9gI8BJwIvAd6U5EXDLdV4m5qaGnYRloWd+2G0w/47DYJY7D4aP5+jZdknGeBoYEtV3VtVjwFX\nAqcOuUxjzT9i7Z2Z/pneq5eMF4+fz9EyDklmLXB/3/rWFpM6N3to8r77PsMmsj3myLNxNg5JZq6/\n5jnbJ2Y+yH7TGQ/9/8Ev9D+nQc/Rv19/Ipk9NPmJJ/4Rm8j21I6azfT0Q3PeZxPQ8rXsn8Kc5Fhg\nQ1Wd1NbPBaqqLpi13/J+o5I0JHvzFOZxSDL7AncBrwH+HrgReFNV3THUgkmSlv9kzKr6eZJ3AZvp\nNf9daoKRpNGw7GsykqTRNQ4d/0+S5J4kf5vk20lubLE1STYnuSvJpiSrh13OUZXk0iTTSW7ti817\n/5JcnGRLkluSHDmcUo+uee7n+iRbk9zcXif1bTuv3c87kpwwnFKPpiQHJbk2ye1Jbkvy7hb387kA\nc9zP323xxft8VtXYvYDvA2tmxS4A/rAtvxf40LDLOaov4NeAI4Fbd3f/gJOBv2jLxwDXD7v8o/aa\n536uB35vjn2PAL5Nryl7HfA9
WouDrwKYAI5sy8+k1x/7Ij+fi34/F+3zOZY1GXrDmme/t1OBy9ry\nZcDrl7REy0hVfQN4eFZ49v07tS9+eTvuBmB1kgOXopzLxTz3E+Yefn8qcGVVPV5V9wBb6E04FlBV\nD1XVLW35UeAO4CD8fC7IPPdzZp7honw+xzXJFLApyd8keWuLHVhV09C7scA/H1rplqcDZt2/A1p8\n9mTYB3Ay7KDe2ZpwPtHXvOP9HFCSdfRqiNfz5L9vP597qO9+3tBCi/L5HNck84qqehnwWno36tdx\ndlxXBp4Mq518HDi0qo4EHgIubHHv5wCSPBP4IvCe9g18vnvk/RzAHPdz0T6fY5lk2jcZquoHwJfp\nVeemZ6rJSSaAfxheCZel+e7fVuC5ffsdBDy4xGVbdqrqB9UauYH/zo4mB+/nbiRZRe8/xE9X1dUt\n7Odzgea6n4v5+Ry7JJPk6S0rk+QZwAnAbcBG4Ky225nA1XOeQDPCzt9a+u/fWey4fxuBM2D70xe2\nzTRbaCc73c/2H+GM3wK+05Y3Aqcn2S/JwcAL6E0w1g5/BtxeVRf1xfx8LtyT7udifj7Hbp5Me+Nf\noleFWwV8pqo+lOTZwOfpZeH7gNOqatvwSjq6klwBTALPAabpjTT5MvAF5rh/ST4GnAT8BDi7qm4e\nQrFH1jz38zh67d9PAPcAb5v5zy/JecA5wGP0mi82L32pR1OSVwJfp/fFceYhcefT+49uzr9vP5/z\n28X9/G0W6fM5dklGkjQ6xq65TJI0OkwykqTOmGQkSZ0xyUiSOmOSkSR1xiQjSeqMSUaS1BmTjCSp\nM/8flUvxQMNUllgAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7f51d01f7ad0>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Distribution of the raw heights, 100 bins.\n",
    "# `bins` is passed by keyword: newer pandas takes `by` as the first positional argument of plot.hist.\n",
    "full_df['height'].plot.hist(bins=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Heights under 100 are shifted up by 100\n",
    "# (presumably values recorded without the leading hundred -- TODO confirm).\n",
    "full_df.loc[full_df['height'] < 100, 'height'] += 100"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#####  arterial pressure fix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Negative pressure readings are replaced by their absolute value.\n",
    "for pressure_col in ('ap_hi', 'ap_lo'):\n",
    "    full_df[pressure_col] = full_df[pressure_col].abs()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([  80.,  215.])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 0.3rd and 99.9th percentiles of ap_hi, used to eyeball plausible bounds.\n",
    "np.percentile(full_df['ap_hi'], [0.3, 99.9])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([  60.,  110.])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 1st and 98th percentiles of ap_lo, used to eyeball plausible bounds.\n",
    "np.percentile(full_df['ap_lo'], [1, 98])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Keep dividing ap_hi readings above 800 by 10 until none remain above that threshold.\n",
    "while (full_df['ap_hi'] > 800).any():\n",
    "    full_df.loc[full_df['ap_hi'] > 800, 'ap_hi'] /= 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Three passes: on each pass, ap_hi readings below 22 are multiplied by 10.\n",
    "for _pass in range(3):\n",
    "    full_df.loc[full_df['ap_hi'] < 22, 'ap_hi'] *= 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Keep dividing ap_lo readings above 600 by 10 until none remain above that threshold.\n",
    "while (full_df['ap_lo'] > 600).any():\n",
    "    full_df.loc[full_df['ap_lo'] > 600, 'ap_lo'] /= 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Three passes: on each pass, ap_lo readings below 11 are multiplied by 10.\n",
    "for _pass in range(3):\n",
    "    full_df.loc[full_df['ap_lo'] < 11, 'ap_lo'] *= 10"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "##  Adding features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>active</th>\n",
       "      <th>age</th>\n",
       "      <th>alco</th>\n",
       "      <th>ap_hi</th>\n",
       "      <th>ap_lo</th>\n",
       "      <th>cardio</th>\n",
       "      <th>cholesterol</th>\n",
       "      <th>gender</th>\n",
       "      <th>gluc</th>\n",
       "      <th>height</th>\n",
       "      <th>id</th>\n",
       "      <th>smoke</th>\n",
       "      <th>weight</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.0</td>\n",
       "      <td>18393</td>\n",
       "      <td>0.0</td>\n",
       "      <td>110.0</td>\n",
       "      <td>80.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>168</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>62.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>20228</td>\n",
       "      <td>0.0</td>\n",
       "      <td>140.0</td>\n",
       "      <td>90.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>156</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>85.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>18857</td>\n",
       "      <td>0.0</td>\n",
       "      <td>130.0</td>\n",
       "      <td>70.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>165</td>\n",
       "      <td>2</td>\n",
       "      <td>0.0</td>\n",
       "      <td>64.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.0</td>\n",
       "      <td>17623</td>\n",
       "      <td>0.0</td>\n",
       "      <td>150.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>169</td>\n",
       "      <td>3</td>\n",
       "      <td>0.0</td>\n",
       "      <td>82.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>17474</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>60.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>156</td>\n",
       "      <td>4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>56.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   active    age  alco  ap_hi  ap_lo  cardio  cholesterol  gender  gluc  \\\n",
       "0     1.0  18393   0.0  110.0   80.0     0.0            1       2     1   \n",
       "1     1.0  20228   0.0  140.0   90.0     1.0            3       1     1   \n",
       "2     0.0  18857   0.0  130.0   70.0     1.0            3       1     1   \n",
       "3     1.0  17623   0.0  150.0  100.0     1.0            1       2     1   \n",
       "4     0.0  17474   0.0  100.0   60.0     0.0            1       1     1   \n",
       "\n",
       "   height  id  smoke  weight  \n",
       "0     168   0    0.0    62.0  \n",
       "1     156   1    0.0    85.0  \n",
       "2     165   2    0.0    64.0  \n",
       "3     169   3    0.0    82.0  \n",
       "4     156   4    0.0    56.0  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the combined, cleaned frame.\n",
    "full_df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Body-mass index: weight divided by the square of (height / 100).\n",
    "full_df['bmi'] = full_df['weight'] / (full_df['height'] / 100) ** 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7f519bb35f50>"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZkAAAEACAYAAABhzAtFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHG9JREFUeJzt3X2QFfWd7/H3RxCfYhDdlbmBCKLiQ3YTFzfK3Tx41Kig\npcSqNdG9RjQm68Y8uEltIuitMKncuhHren2oXGPuLuuCGyVREyUVS9HSsylrA2iUgIo4lk+AYbIa\nIDduyiB87x/9G2yPZ/CcmenpnjOfV9WU3d/uPv09LXO+83s43YoIzMzMirBH2QmYmVnncpExM7PC\nuMiYmVlhXGTMzKwwLjJmZlYYFxkzMytMoUVG0iJJvZLWNMS/LOkZSWslXZ2Lz5fUI2mdpNNy8Vlp\n/2clXZGLT5W0QtJ6SbdLGlvk+zEzs/YU3ZK5BTg9H5BUA84C/iwi/hz4Xyl+NPAp4GhgNnCTMnsA\n302v8wHgfElHpZdbCFwbEUcCW4FLCn4/ZmbWhkKLTEQ8AmxpCH8BuDoi3kz7vJric4ClEfFmRLwI\n9ADHp5+eiHgpIrYDS9O+ACcDd6XlxcA5Rb0XMzNrXxljMtOBj6duroclHZfik4ANuf02pVhjfCMw\nSdJBwJaI2JmLv6/Y1M3MrB1ljGGMBQ6IiJmSPgzcAUwD1GTfoHkhjLR/4zG+R46ZWYWUUWQ2AD8G\niIhHJe1IrZKNwCG5/SYDr5AVknfEI+JVSQdI2iO1Zvr2b0qSC5CZ2QBERLNGQEuGo7usscVxN3AK\ngKTpwLiIeA1YBnxa0jhJhwKHA6uAR4HDJU2RNA44D7gnvdZDwLlpeW4u3lREVP5nwYIFpefQCTk6\nT+dZ9Z+RkudgFT2F+Tbg34Hpkl6WdDHwz8A0SWuB24ALASLiaeBHwNPAvcBlkdkBfAlYDjxFNjng\nmXSKecDXJD0LHAgsKvL9lKmrayqSkERX19Sy0zEza0mh3WUR8Tf9bPpMP/t/B/hOk/h9wJFN4i8A\nJwwmx5Git/cl+oacensH3HI1MxtW/sZ/xdRqtbJTeFcjIUdwnkPNeQ6tkZLnYGko+txGAkkxkt+r\nJN6aPKch6Ss1M3s3koiKD/ybmdko5SJjZmaFcZExM7PCuMiYmVlhXGTMzKwwLjJmZlYYFxkzMyuM\ni4yZmRXGRcbMzArjImNmZoVxkTEzs8K4yIxIe/m2/2Y2IvgGmSNE4w0yfbNMMxsOvkGmmZlVlouM\nmZkVxkXGzMwK4yJjZmaFKbTISFokqVfSmibb/kHSTkkH5mI3SuqRtFrSsbn4XEnPSlov6cJcfIak\nNWnb9UW+FzMza1/RLZlbgNMbg5ImA58AXsrFZgOHRcQRwKXAzSk+Afgm8GHgBGCBpPHpsO8Bn4uI\n6cB0Se84l5mZlafQIhMRjwBbmmy6Dvh6Q2wOsCQdtxIYL2kiWZFaHhHbImIrsByYJakL2D8iVqXj\nlwCfLOBtmJnZAA37mIyks4ANEbG2YdMkYENufWOKNcY35eIbm+xvZmYVMXY4TyZpH+Aq4NRmm5us\nR5M47xI3M7OKGNYiAxwGTAV+pewr7JOBxyUdT9YSeX9u38nAKylea4g/vJv9+9Xd3b1ruVarUavV\n+t3XzGw0qtfr1Ov1IXu9wm8rI2kq8NOI+PMm214AZkTEFklnAF+MiDMlzQSuj4iZaeD/MWAGWffe\nY8BxEbFV0krgy8CjwM+AGyPivn7y8G1lzMzaVOnbyki6Dfh3splfL0u6uGGXXd1eEXEv8IKk54Dv\nA5el+Bbg22TFZSXwrTQBgLTPIuBZoKe/AmNmZuXwDTJHCLdkzKwMlW7JmJnZ6OYiY2ZmhXGRMTOz\nwrjImJlZYVxkzMysMC4yZmZWGBcZMzMrjIuMmZkVxkXGzMwK4yJjZmaFcZExM7PCuMiYmVlhXGTM\nzKwwLjJmZlYYFxkzMyuMi4yZmRXGRc
bMzArjImNmZoVxkTEzs8IUWmQkLZLUK2lNLnaNpHWSVku6\nS9J7c9vmS+pJ20/LxWdJekbSs5KuyMWnSlohab2k2yWNLfL9mJlZe4puydwCnN4QWw58ICKOBXqA\n+QCSjgE+BRwNzAZuUmYP4LvpdT4AnC/pqPRaC4FrI+JIYCtwScHvx8zM2lBokYmIR4AtDbEHI2Jn\nWl0BTE7LZwNLI+LNiHiRrAAdn356IuKliNgOLAXmpGNOBu5Ky4uBc4p6L2Zm1r6yx2Q+C9yblicB\nG3LbNqVYY3wjMEnSQcCWXMHaCLyv2HTNzKwdpY1hSLoK2B4Rt/eFmuwWNC+EkfZvPCZ2d87u7u5d\ny7VajVqt1mK2ZmajQ71ep16vD9nrKWK3n8uDP4E0BfhpRHwwF5sL/C1wckS8kWLzgIiIhWn9PmAB\nWSHpjohZjftJ+g9gYkTslDQTWBARs/vJI4p+r0WSxFs19O3LI/l9mVm1SSIimjUCWjIc3WVva3FI\nmgV8Azi7r8Aky4DzJI2TdChwOLAKeBQ4XNIUSeOA84B70jEPAeem5bm5uJmZVUChLRlJtwE14CCg\nl6xlciUwDngt7bYiIi5L+88nmyG2Hbg8Ipan+CzgBrKiuCgirk7xQ8kmAkwAngAuSJMDmuXiloyZ\nWZsG25IpvLusKjq3yOwNZA3CiROnsHnzi8Oem5l1LheZFnVukXGrxsyKMxLGZMzMbJRykTEzs8K4\nyJiZWWFcZMzMrDAuMmZmVhgXGTMzK4yLjJmZFcZFxszMCuMiY2ZmhXGRMTOzwrjImJlZYVxkzMys\nMC4yZmZWGBcZMzMrjIuMmZkVxkXGzMwK4yLTUfZCEpLo6ppadjJmZn4y5kjR6pMx/ZRMMxtKlX4y\npqRFknolrcnFJkhaLmm9pPsljc9tu1FSj6TVko7NxedKejYdc2EuPkPSmrTt+iLfi5mZta/o7rJb\ngNMbYvOAByPiSOAhYD6ApNnAYRFxBHApcHOKTwC+CXwYOAFYkCtM3wM+FxHTgemSGs9lZmYlKrTI\nRMQjwJaG8BxgcVpenNb74kvScSuB8ZImkhWp5RGxLSK2AsuBWZK6gP0jYlU6fgnwycLejJmZta2M\ngf+DI6IXICI2Awen+CRgQ26/jSnWGN+Ui29ssr+ZmVXE2LITyGkcWOobxW424LS7eL+6u7t3Lddq\nNWq1WlsJmpl1unq9Tr1eH7LXK3x2maQpwE8j4oNpfR1Qi4je1OX1cEQcLenmtPzDtN8zwInASWn/\nv0vxm4GHgX/rOzbFzwNOjIgv9JOHZ5eZmbWp0rPLEvH2Vscy4KK0fBFwTy5+IYCkmcDW1K12P3Cq\npPFpEsCpwP2pq+13ko5X9gl8Ye61zMysAgrtLpN0G1ADDpL0MrAAuBq4Q9JngZeBcwEi4l5JZ0h6\nDngduDjFt0j6NvAY2Z/p30oTAAAuA/4F2Bu4NyLuK/L9mJlZe1rqLpP0ZxHx5DDkUxh3l5mZtW+4\nustulrRK0mWSDhjoyczMbHRpqchExEeB/wa8H3hM0m2STi00MzMzG/Haml0maQzZFx5vBH5H1j9z\nZUT8uJj0ho67y8zM2jcs3WWSPijpOmAdcDJwVpo6fDJw3UBPbmZmna3Vgf+fA/8I3BkRf2jY9pmI\nuLWg/IaMWzJmZu0bbEum1SLzHuAPEbEjre8B7B0R/znQEw83Fxkzs/YN1+yyB4F9cuv7ppiZmVm/\nWi0ye0fE7/tW0vK+xaRkZmadotUi87qkGX0rko4D/rCb/c3MzFq+rczfk90K5pW0/l+ATxeTkpmZ\ndYqWvycjaU/gSLLR5WciYnuRiQ01D/ybmbVvWGaXpRP9FTCVXOsnIpYM9MTDzUXGzKx9gy0yLXWX\nSboVOAxYDexI4SA9LtnMzKyZVsdk/hI4ZkQ3BczMbNi1OrvsSaCryETMzKzztNqS+RPgaUmrgDf6\ngh
FxdiFZmZlZR2i1yHQXmYSZmXWmdmaXTQGOiIgHJe0LjImI/1dodkPIs8vMzNo3XLf6/zxwJ/D9\nFJoE3D3Qk5qZ2ejQ6sD/F4GPkD2ojIjoAQ4ezIklfVXSk5LWSPqBpHGSpkpaIWm9pNsljU37jpO0\nVFKPpF9IOiT3OvNTfJ2k0waTU5V0dU1F0q4fM7ORqNUi80ZE/LFvJX34D7gvRtL7gC8DMyLig2Rj\nQ+cDC4FrI+JIYCtwSTrkEuC3EXEEcD1wTXqdY4BPAUcDs4Gb1CGfyL29L5Fd4r4fM7ORp9Ui82+S\nrgT2kXQqcAfw00GeewywXypY+wCvACcBd6Xti8ke9QwwJ61D1m13clo+G1gaEW9GxItAD3D8IPMy\nM7Mh0mqRmQf8B7AWuBS4F/jvAz1pRLwCXAu8DGwCtgGPA1sjYmfabSPZ2A/pvxvSsTuAbZIOzMeT\nTbljzMysZC1NYU4f/P+YfgZN0gFkrZMpZAXmDrLurnecuu+Qfrb1F2+qu7t713KtVqNWq7WUr5nZ\naFGv16nX60P2eq0+fvkFmnx4R8S0AZ1U+mvg9Ij4fFr/DPBfgb8GuiJip6SZwIKImC3pvrS8UtIY\n4NcRcbCkeVkasTC9zq79mpxzRE1hfvuUZWh12rKnMJvZUBqWG2SS3busz97AucCBAz0pWTfZTEl7\nk91B4BTgUeCg9No/BOYC96T9l6X1lWn7Q7n4DyRdR9ZNdjiwahB5mZnZEGr5y5jvOFD6ZUQcN+AT\nSwuA84DtwBPA54DJwFJgQopdEBHbJe0F3Ar8BfAacF4a6EfSfLLZZ9uByyNieT/nc0vGzKxNw/I8\nmfyjl8kmC/wl8IWI+NBATzzcXGTMzNo3XN1l1+aW3wReJPt+ilXWXru+xDlx4hQ2b36x3HTMbFQa\ncHfZSDMaWzJu1ZjZYA3XkzG/trvtEfG/B5qAmZl1rnZml32YbDYXwFlks7h6ikjKzMw6Q6sD/z8H\nzuy7tb+k/YGfRcTHC85vyLi7bOS8dzOrjmG51T8wEfhjbv2PKWZmZtavVrvLlgCrJP2E7M/jc3jr\nhpVmZmZNtfNkzBnAx9LqzyPiicKyKoC7y0bOezez6hiu7jKAfYHfRcQNwEZJhw70pGZmNjq0+vjl\nBcAVwPwU2hP416KSMjOzztBqS+YcsgeEvQ67ngezf1FJmZlZZ2i1yPwxDWgEgKT9ikvJzMw6RatF\n5keSvg8cIOnzwIMM0QPMzMysc7Uzu+xU4DSyaUv3R8QDRSY21Eb37LK+x/b4Zplm1p7Cb/WfnkT5\nYEScNNCTVMHoLjKezmxmA1P4FOaI2AHslDR+oCcxM7PRqdVv/P8eWCvpAdIMM4CI+EohWZmZWUdo\ntcj8OP2YmZm1bLdjMpIOiYiXhzGfwnhMxmMyZta+osdk7s6d6K6BnqQZSeMl3SFpnaSnJJ0gaYKk\n5ZLWS7o/Pw4k6UZJPZJWSzo2F58r6dl0zIVDmaOZmQ3OuxWZfPWaNsTnvgG4NyKOBj4EPAPMI5vJ\ndiTwEOk2NpJmA4dFxBHApcDNKT4B+CbZA9VOABZ4goKZWXW8W5GJfpYHJT307GMRcQtARLwZEduA\nObz1CIHFaZ303yVp35XAeEkTgdOB5RGxLSK2AsuBWUOVp5mZDc67Dfx/SNLvyFo0+6Rl0npExHsH\neN5pwKuSbiFrxTwG/D0wMSJ6yV58s6SD0/6TgA254zemWGN8U4qZmVkF7LbIRMSYAs87A/hiRDwm\n6TqyrrL+WkuNg059I9nNBqP6bXF1d3fvWq7VatRqtdYzNjMbBer1OvV6fcher+Xbygyl1NX1i4iY\nltY/SlZkDgNqEdErqQt4OCKOlnRzWv5h2v8Z4ETgpLT/36X42/ZrOKdnl6XlkXQdzKxcw/nQsiGT\nusQ2SJqeQqcATwHLgItS7CLgnrS8DLgQQNJMYGt6jfuBU9NMtQnA
qSlmZmYV0OqXMYvwFeAHkvYE\nngcuBsaQ3fH5s8DLwLkAEXGvpDMkPUd2x4GLU3yLpG+TjekE8K00AcDMzCqglO6yMri7zN1lZta+\nEdldZmZmo4OLjJmZFcZFxszMCuMiY2ZmhXGRMTOzwrjImJlZYVxkzMysMC4yZmZWGBcZMzMrjIvM\nqLMXkpBEV9fUspMxsw7n28pUVJG3lfEtZsysVb6tjJmZVZaLjJmZFcZFxszMCuMiY2ZmhXGRMTOz\nwrjImJlZYVxkzMysMC4yZmZWmFKLjKQ9JD0uaVlanypphaT1km6XNDbFx0laKqlH0i8kHZJ7jfkp\nvk7SaWW9FzMze6eyWzKXA0/n1hcC10bEkcBW4JIUvwT4bUQcAVwPXAMg6RjgU8DRwGzgJmVflTcz\nswoorchImgycAfxTLnwycFdaXgx8Mi3PSesAd6b9AM4GlkbEmxHxItADHF9g2mZm1oYyWzLXAV8n\n3UhL0kHAlojYmbZvBCal5UnABoCI2AFsk3RgPp5syh1jZmYlG1vGSSWdCfRGxGpJtb5w+snL39Wx\nUewm3lR3d/eu5VqtRq1W62/XUnR1TaW396Wy0zCzUaxer1Ov14fs9Uq5C7Ok/wlcALwJ7APsD9wN\nnAZ0RcROSTOBBRExW9J9aXmlpDHAryPiYEnzgIiIhel1d+3X5JyVvwvz2++87Lswm1n5RuRdmCPi\nyog4JCKmAecBD0XEBcDDwLlpt7nAPWl5WVonbX8oFz8vzT47FDgcWDUc78HMzN5dKd1luzEPWCrp\n28ATwKIUXwTcKqkHeI2sMBERT0v6EdkMte3AZZVvrpiZjSJ+aFmFDH932d7AGwBMnDiFzZtfHIJ3\nYWadZLDdZS4yFVLGmIzHZ8xsd0bkmIyZmY0OLjJmZlYYFxkzMyuMi4yZmRXGRcbMzArjImNmZoVx\nkTEzs8K4yJiZWWFcZMzMrDAuMmZmVhgXGUv2QtKun66uqWUnZGYdwPcuq5Cy713WeL6qXy8zK57v\nXWZmZpXlImNmZoVxkTEzs8K4yJiZWWFcZMzMrDClFBlJkyU9JOlpSWslfSXFJ0haLmm9pPsljc8d\nc6OkHkmrJR2bi8+V9Gw65sIy3o+ZmTVXyhRmSV1AV0SslvQe4JfAHOBi4LWIuEbSFcCEiJgnaTbw\npYg4U9IJwA0RMVPSBOAxYAbZHNxfAjMiYluTc3oKc8vL2XrVr5eZFW9ETmGOiM0RsTot/x5YB0wm\nKzSL026L0zrpv0vS/iuB8ZImAqcDyyNiW0RsBZYDs4btjZiZ2W6VPiYjaSpwLLACmBgRvZAVIuDg\ntNskYEPusI0p1hjflGJmZlYBpRaZ1FV2J3B5atH01z/T2FTr69tp1oRzH4+ZWUWMLevEksaSFZhb\nI+KeFO6VNDEietO4zW9SfCPw/tzhk4FXUrzWEH+4v3N2d3fvWq7VatRqtf52NTMbler1OvV6fche\nr7R7l0laArwaEV/LxRYCv42IhZLmAQekgf8zgC+mgf+ZwPVNBv73SMvHpfGZxvN54L/lZYC9gTcA\nmDhxCps3vziAd2RmI91gB/7Lml32EeDnwFqyT7YArgRWAT8ia7W8DJzbVzAkfZdsUP914OKIeDzF\nLwKuSq/xPyJiST/ndJFpefmd26p+7cysGCOyyJTBRcZFxszaNyKnMNtburqm7nqGi5lZp3FLpmT9\nt17ckjGz8rklY2ZmleUiY2ZmhXGRsRbstWvcqKtratnJmNkI4jGZko2UMRmPz5iNTh6TMTOzynKR\nMTOzwrjImJlZYVxkzMysMC4yZmZWGBcZa5OnM5tZ6zyFuWQjcQqzpzObjR6ewmxmZpXlImOD4K4z\nM9s9F5kSdM7t/d+g75lzvb0vlZ2MmVWQx2RK0No4zMgYk/H4jFln85iMmZlVlouMDRGPz5jZO3VE\nkZE0S9Izkp6VdEXZ+YxO+fGZ
zS44ZgZ0QJGRtAfwXeB04APA+ZKOKjerwaiXnUAL6u+yvRoTAur1\nemnnbofzHFrOs1pGfJEBjgd6IuKliNgOLAXmlJzTO7Q+o6w+HOkMUr2NfcvrRhspv8TOc2g5z2rp\nhCIzCdiQW9+YYpWS/UUfvH0G12jQvBtNEmPG7OduNbMO1wlFplnToOVP8gceeOBtH3w/+cndQ5ZY\n53wfZqi8VXAg2LnzP2lWgPLFp5VlFyiz6hrx35ORNBPojohZaX0eEBGxsGG/kf1GzcxKMpjvyXRC\nkRkDrAdOAX4NrALOj4h1pSZmZmaMLTuBwYqIHZK+BCwn6/5b5AJjZlYNI74lY2Zm1dUJA/+7VdUv\nakqaLOkhSU9LWivpKyk+QdJySesl3S9pfNm5QvZ9JEmPS1qW1qdKWpHyvF1S6a1iSeMl3SFpnaSn\nJJ1Qtesp6auSnpS0RtIPJI2ryrWUtEhSr6Q1uVi/10/SjZJ6JK2WdGyJOV6T/p+vlnSXpPfmts1P\nOa6TdNpw5Nhfnrlt/yBpp6QDc7Fhv5a7y1PSl9Pn5lpJV+fi7V/PiOjYH7Ii+hwwBdgTWA0cVXZe\nKbcu4Ni0/B6ycaWjgIXAN1L8CuDqsnNNuXwV+FdgWVr/IXBuWv4ecGkFcvwX4OK0PBYYX6XrCbwP\neB4Yl7uGc6tyLYGPAscCa3KxptcPmA38LC2fAKwoMcdPAHuk5auB76TlY4An0r+FqemzQGXlmeKT\ngfuAF4ADy7yWu7meNbLhh7Fp/U/Sf48eyPXs9JZMZb+oGRGbI2J1Wv49sI7sH+AcYHHabTHwyXIy\nfIukycAZwD/lwicDd6XlxcA5w51XnqT9gY9FxC0AEfFmRGyjetdzDLBfaq3sA7wCnEQFrmVEPAJs\naQg3Xr85ufiSdNxKYLykiWXkGBEPRsTOtLqC7PcI4Gxgafq38CLQQ/aZULh+riXAdcDXG2KlXMt0\nvmZ5foHsj4k30z6v5vJs+3p2epEZEV/UlDSV7K+JFcDEiOiFrBABf1peZrv0/WIEgKSDgC25X+yN\nZH+ll2ka8KqkW1K33v+VtC8Vup4R8QpwLfAysAnYBjwObK3Ytcw7uOH6HZzijb9bm6jG79ZngXvT\ncqVylHQWsCEi1jZsqlSewHTg46kL92FJx6X4gPLs9CIzqC9qDgdJ7wHuBC5PLZqq5Xcm0JtaXX3X\nU7zz2pad91hgBvB/ImIG8Dowj/Lz2kXSAWR/DU4hKyT7kXWVNKpMzrtRud8tSVcB2yPi9r5Qk91K\nyVHSPsBVwIJmm5vEyryWY4EDImIm8A3gjhQfUJ6dXmQ2Aofk1ieTdU9UQuoyuRO4NSLuSeHevqay\npC7gN2Xll3wEOFvS88DtZN1k15M16fv+/VThum4k+yvxsbR+F1nRqdL1/ATwfET8NiJ2AD8B/go4\noGLXMq+/67cReH9uv1LzljSXrEv3b3LhKuV4GNk4xq8kvZByeVzSwVQrT8haKz8GiIhHgR2p92JA\nn6edXmQeBQ6XNEXSOOA8YFnJOeX9M/B0RNyQiy0DLkrLc4F7Gg8aThFxZUQcEhHTyK7fQxFxAfAw\ncG7arQp59gIbJE1PoVOAp6jW9XwZmClpb0nirRyrdC0bW6n563cRb+W2DLgQdt11Y2tft9oweFuO\nkmaR/cV9dkS8kdtvGXBemsF3KHA42Ze1h8uuPCPiyYjoiohpEXEo2Qf2X0TEbyj3Wr4tz+Rusn+b\npN+ncRHxWsrz021fz+GaxVDWDzCLbOZWDzCv7HxyeX0E2EE24+0Jsr75WcCBwIMp5wfImq2l55ty\nPpG3ZpcdCqwEniWbHbVnBfL7ENkfFqvJ/hIbX7XrSdZdsg5YQzaQvmdVriVwG9lfpm+QFcSLgQn9\nXT+yR2w8B/wKmFFijj3AS+l36HHgptz+81OO64DTyryWDdufJ80uK+ta7uZ6jgVuBdYCjwEnDu
Z6\n+suYZmZWmE7vLjMzsxK5yJiZWWFcZMzMrDAuMmZmVhgXGTMzK4yLjJmZFcZFxszMCuMiY2Zmhfn/\n282XXXvboSEAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x7f519baac390>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Distribution of the derived BMI feature, 100 bins.\n",
    "# `bins` is passed by keyword: newer pandas takes `by` as the first positional argument of plot.hist.\n",
    "full_df['bmi'].plot.hist(bins=100)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Split back into train and test sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Rows with a known `cardio` label are training data; rows where it is missing are the test set.\n",
    "train_df = full_df[full_df['cardio'].notnull()]\n",
    "test_df = full_df[full_df['cardio'].isnull()].drop(['cardio'], axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Names of the test-set columns that contain at least one missing value.\n",
    "columns_with_nulls = test_df.columns[test_df.isnull().any(axis=0)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Target is the `cardio` label; features are every remaining column except the row id.\n",
    "y = train_df['cardio']\n",
    "X = train_df.drop(['id', 'cardio'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Test features: drop the row id and replace missing values with the -999 sentinel.\n",
    "X_test = test_df.drop(['id'], axis=1).fillna(-999)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "686f8be9dc43464caf3fd2ae03dee39c"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Ensemble of 300 CatBoost classifiers with randomised hyper-parameters.\n",
    "models = []\n",
    "\n",
    "np.random.seed(42)\n",
    "for i in tqdm.tqdm_notebook(range(300)):\n",
    "    X_train = X.copy()\n",
    "    # For every column that has missing values in the test set, overwrite a random tenth of the\n",
    "    # training rows with -999, the same sentinel X_test is filled with.\n",
    "    for col in columns_with_nulls:\n",
    "        _idx = np.random.choice(X_train.index, size=X_train.shape[0]//10, replace=False)\n",
    "        X_train.loc[_idx, col] = -999\n",
    "    \n",
    "    # All random draws below come from the global NumPy RNG seeded above; the arguments are\n",
    "    # evaluated in order, so reordering them would change which value each parameter receives.\n",
    "    # NOTE(review): seeds are drawn from [0, 10**10), beyond the signed 32-bit range --\n",
    "    # confirm the library accepts such values.\n",
    "    model = CatBoostClassifier(\n",
    "        iterations=500 + np.random.randint(1000),\n",
    "        depth=4 + np.random.randint(4),\n",
    "        thread_count=12,\n",
    "        border_count=128,\n",
    "        learning_rate=0.010 + (np.random.rand() * 1e-2),\n",
    "        random_seed=np.random.randint(10**10)\n",
    "    )\n",
    "    model.fit(X_train, y)\n",
    "    # A copy of the fitted model is stored in the ensemble list.\n",
    "    models.append(model.copy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5cec382dd75c47f48a70e0f74993a5cd"
      }
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Ensemble of LightGBM classifiers. The list is created here so that the cell runs on a\n",
    "# fresh kernel (Restart & Run All) instead of relying on a `models2` left over in memory.\n",
    "models2 = []\n",
    "\n",
    "np.random.seed(42)\n",
    "# range(100,300) yields 200 iterations; the loop variable itself is not used.\n",
    "for i in tqdm.tqdm_notebook(range(100,300)):\n",
    "    X_train = X.copy()\n",
    "    # For every column that has missing values in the test set, overwrite a random tenth of the\n",
    "    # training rows with -999, the same sentinel X_test is filled with.\n",
    "    for col in columns_with_nulls:\n",
    "        _idx = np.random.choice(X_train.index, size=X_train.shape[0]//10, replace=False)\n",
    "        X_train.loc[_idx, col] = -999\n",
    "    \n",
    "    # Only the seed, max_depth and num_leaves are randomised; the other values are fixed.\n",
    "    # NOTE(review): seeds are drawn from [0, 10**10), beyond the signed 32-bit range --\n",
    "    # confirm the library accepts such values.\n",
    "    model = lightgbm.LGBMClassifier(\n",
    "        seed=np.random.randint(10**10),\n",
    "        n_estimators=3680,\n",
    "        max_depth=15 + np.random.randint(0,10),\n",
    "        num_leaves=20 + np.random.randint(0,10),\n",
    "        subsample=0.99868,\n",
    "        colsample_bytree=0.8022,\n",
    "        reg_alpha=26.4310,\n",
    "        reg_lambda=19.7836,\n",
    "        max_bin=8850,\n",
    "        objective='binary',\n",
    "        nthread=12\n",
    "    )\n",
    "    model.fit(X_train, y)\n",
    "    models2.append(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Second predict_proba column of every CatBoost model, stacked and transposed so the\n",
    "# result has one row per test sample and one column per model.\n",
    "predictions = np.vstack(\n",
    "    [_model.predict_proba(X_test)[:,1] for _model in tqdm.tqdm_notebook(models)]\n",
    ").T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Second predict_proba column of every LightGBM model, stacked and transposed so the\n",
    "# result has one row per test sample and one column per model.\n",
    "predictions2 = np.vstack(\n",
    "    [_model.predict_proba(X_test)[:,1] for _model in tqdm.tqdm_notebook(models2)]\n",
    ").T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Put both ensembles side by side and take the unweighted mean over all model columns.\n",
    "prediction = np.concatenate([predictions, predictions2], axis=1).mean(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Write the blended predictions to a timestamped CSV, without header or index column.\n",
    "pd.DataFrame(prediction).to_csv(\n",
    "    'submission_{date}.csv'.format(date=datetime.today().strftime('%Y%m%d_%H%M')),\n",
    "    header=False,\n",
    "    index=False\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  },
  "widgets": {
   "state": {
    "298684729f264868a07fbdc28fbb451b": {
     "views": [
      {
       "cell_index": 40
      }
     ]
    },
    "457c04007e0247058f58de51d5922b0c": {
     "views": [
      {
       "cell_index": 31
      }
     ]
    },
    "8ea30877ace841dd80b984ddc27368bd": {
     "views": [
      {
       "cell_index": 38
      }
     ]
    },
    "a6639a7ea13c4d5dbc6a4e5fd346e2c0": {
     "views": [
      {
       "cell_index": 38
      }
     ]
    },
    "f468a7b4d4944cb7984d10fb5e424f9b": {
     "views": [
      {
       "cell_index": 41
      }
     ]
    }
   },
   "version": "1.2.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
